per_read_modified_base_calls.txt but with binary methylation calls using the default cut-offs, i.e., filter out probabilities >20% and <80%, like the bedMethyl files but not aggregated.# see here (https://nanoporetech.github.io/megalodon/file_formats.html) for variable names
library(data.table)
binaryCalls <- fread(file = "/Users/koenvandenberge/data/molly/megalodon_output_06/per_read_modified_base_calls_binary.txt.gz",
nrows = 1e5)
colnames(binaryCalls) <- c("readID", "chr", "strand", "pos",
"mod_log_prob", "can_log_prob", "mod_base")
binaryCalls$methylation <- ifelse(binaryCalls$mod_log_prob > -0.2231436, 1, 0)
table(binaryCalls$methylation)
##
## 0 1
## 89988 10012
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.2 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.4
## ✔ tidyr 1.0.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ purrr::transpose() masks data.table::transpose()
readCalls <- binaryCalls %>%
group_by(readID) %>%
summarize(avgMeth = mean(methylation),
length = n())
hist(readCalls$avgMeth, breaks=20)
logit <- function(x) log(x / (1-x))
plot(x=readCalls$length, y=logit(readCalls$avgMeth), log="x",
xlab = "Number of bases with confident methylation calls in a read",
ylab = "Logit average methylation")
Below we look at autocorrelation for random reads. It is clear that there is strong autocorrelation. This is very obvious when comparing the autocorrelation function of the observed data versus that of permuted reads.
Note the interesting observation that the ACF function appears bimodal. Clould this be related to nucleosome positions? Since the spacing is unequal across reads, does the multi-modality relate to a particular genomic distance?
rafalib::mypar(mfrow=c(4,4),
mar=c(1,2.5,3.2,1))
#for(rr in 1:27){
for(rr in 100:127){
curAvgMeth <- readCalls$avgMeth[rr]
if(curAvgMeth >0 & curAvgMeth < 1){
rid <- readCalls$readID[rr]
curDf <- binaryCalls[binaryCalls$readID == rid,]
curRange <- range(curDf$pos)
curChr <- unique(curDf$chr)
acf(curDf$methylation, ylim=c(0,1), xlim=c(0,30),
main=paste0("chr", curChr,": ", round(curRange[1]/1e3, digits = 2),
" - ", round(curRange[2]/1e3, digits = 2), "kb"))
} else next
}
rafalib::mypar(mfrow=c(4,4),
mar=c(1,2.5,3.2,1))
#for(rr in 1:27){
for(rr in 100:127){
curAvgMeth <- readCalls$avgMeth[rr]
if(curAvgMeth >0 & curAvgMeth < 1){
rid <- readCalls$readID[rr]
curDf <- binaryCalls[binaryCalls$readID == rid,]
curRange <- range(curDf$pos)
curChr <- unique(curDf$chr)
y <- curDf$methylation
y <- y - mean(y)
maxLag <- 30
n <- length(y)
acf <- c()
for(ll in 1:maxLag){
acf[ll] <- cor(y[1:(n-ll)], y[(ll+1):n])
}
plot(x=1:maxLag, y=acf, type='h', ylim=c(0,1),
main=paste0("chr", curChr,": ", round(curRange[1]/1e3, digits = 2),
" - ", round(curRange[2]/1e3, digits = 2), "kb"))
} else next
}
## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero
## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero
## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero
## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero
## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero
## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero
## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero
## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero
## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero
## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero
## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero
## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero
## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero
## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero
## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero
## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero
rafalib::mypar(mfrow=c(4,4))
for(rr in 1:27){
curAvgMeth <- readCalls$avgMeth[rr]
if(curAvgMeth >0 & curAvgMeth < 1){
acf(sample(binaryCalls$methylation[binaryCalls$readID == readCalls$readID[rr]]),
ylim=c(0,1))
} else next
}
There is high autocorrelation on the methylation signal on single reads, at the level of single methylated adenine nucleotides. Note that, this could however be expected. Indeed, Molly suggested that the actual biologically relevant resolution might be the resolution of linker regions inbetween nucleosomes. In order to identify these linker regions, we could use the nucleosome occupancy data from Chereji et al. (2018) and, for example, set a threshold on when a region is ‘not occupied by nucleosomes’, and then calculating autocorrelation between those regions.
meth <- fread("~/data/molly/modified_bases.6mA.bed")
colnames(meth) <- c("chrom", "start", "end", "name", "score",
"strand", "startCodon", "stopCodon", "color",
"coverage", "percentage")
meth_cols <- c("chrom", "start", "coverage", "percentage")
filtered_meth <- meth[coverage > 10, ..meth_cols]
nucs <- fread("~/data/molly/Chereji_Occupancy_rep1_singlebp.bedGraph")
colnames(nucs) <- c("chrom", "start", "end", "occupancy")
#plot percentage methylation as a column chart
HMR <- filtered_meth[chrom == "III" & start > 291e3 & start < 294e3]
HMR_nucs <- nucs[chrom == "chrIII" & start > 291e3 & start < 294e3]
ggplot() +
geom_col(HMR, mapping = aes(x = start, y = percentage), color = "mediumpurple4") +
geom_col(HMR_nucs, mapping = aes(x = start, y = occupancy * 20), color = "black") +
theme_minimal() +
labs(title = "SIR3-EcoGII (JRY13027)",
x = "position on chr III",
y = "% methylated reads")
plot(x=HMR$start, y=rep(-3,3,length.out=nrow(HMR)),
type="n", ylim=c(-3,3),
ylab="Scaled average methylation / occupancy",
xlab="Chromosome position")
# Methylation
loMethyl <- loess(percentage/100 ~ start, data=HMR, weights=HMR$coverage, enp.target = 50)
lines(x=loMethyl$x[order(loMethyl$x)], y=scale(loMethyl$fitted[order(loMethyl$x)]),
col="orange", lwd=4)
# Nucleosome occupancy
loOccup <- loess(occupancy/max(occupancy) ~ start, data=HMR_nucs, enp.target = 50)
lines(x=loOccup$x, y=scale(loOccup$fitted), col="darkseagreen3", lwd=4)
legend("topleft", c("Methylation", "Nucleosome occupancy"),
col=c("orange", "darkseagreen3"), lty=1, cex=1,
bty='n', lwd=4)
# Set arbitrary threshold for defining linnker regions
plot(HMR_nucs$start, HMR_nucs$occupancy, pch=16, cex=1/3)
abline(h=0.5, lty=2, col="red")
# Aggregate methylation calls at the level of linker regions
FtoT <- which(diff(HMR_nucs$occupancy < 1/2) == 1)
TtoF <- which(diff(HMR_nucs$occupancy < 1/2) == -1)
dfLink <- data.frame(start = FtoT,
end = TtoF)
# only keep linker regions with at least 5bp in size
dfLink <- dfLink[(dfLink$end - dfLink$start) >=5,]
plot(HMR_nucs$start, HMR_nucs$occupancy, pch=16, cex=1/3)
abline(h=0.5, lty=2, col="red")
abline(v=HMR_nucs$start[dfLink$start], col="blue")
abline(v=HMR_nucs$start[dfLink$end], col="green")
binaryCalls <- fread(file = "/Users/koenvandenberge/data/molly/megalodon_output_06/per_read_modified_base_calls_binary.txt.gz",
nrows = 6e6, skip=11e6)
colnames(binaryCalls) <- c("readID", "chr", "strand", "pos",
"mod_log_prob", "can_log_prob", "mod_base")
table(binaryCalls$chr)
##
## II III IV
## 1606661 3632511 760828
binaryCalls$methylation <- ifelse(binaryCalls$mod_log_prob > -0.2231436, 1, 0)
binaryCalls3 <- binaryCalls[binaryCalls$chr == "III",]
## organize reads
readInfo <- binaryCalls3 %>%
group_by(readID) %>%
summarize(start=min(pos),
end=max(pos),
chr=unique(chr))
startLink <- HMR_nucs$start[dfLink$start[1]]
endLink <- HMR_nucs$start[dfLink$end[nrow(dfLink)]]
# overlappingReads <- readInfo[(readInfo$end >= 291e3 & readInfo$start <=294e3) | (readInfo$start < 294e3 & readInfo$start > 290e3),]
overlappingReads <- readInfo[(readInfo$end >= 291e3 & readInfo$start <=294e3),]
readAvMethList <- list()
for(rr in 1:nrow(overlappingReads)){
curData <- binaryCalls3[binaryCalls3$readID == overlappingReads$readID[rr],]
avMethLinkers <- c()
for(bb in 1:nrow(dfLink)){
curStart <- HMR_nucs$start[dfLink$start[bb]]
curEnd <- HMR_nucs$end[dfLink$end[bb]]
curId <- curData$pos >= curStart & curData$pos <= curEnd
if(sum(curId) > 0){
curBinData <- curData[curData$pos >= curStart & curData$pos <= curEnd,]
avMethLinkers[bb] <- mean(curBinData$methylation)
names(avMethLinkers)[bb] <- nrow(curBinData) #coverage
} else {
avMethLinkers[bb] <- NA
}
}
readAvMethList[[rr]] <- avMethLinkers
}
avMethMat <- do.call(rbind, readAvMethList)
colSums(is.na(avMethMat)) # often no data in defined linker regions
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 1 1
## 34 19 37 21 54 86 27 31 28 27 49 42
# focus on 24 reads with data in all linker regions
readAvMethListComplete <- readAvMethList[rowSums(is.na(avMethMat)) == 0]
rafalib::mypar(mfrow=c(2,2))
for(ll in 1:length(readAvMethListComplete)){
y <- readAvMethListComplete[[ll]]
if(var(y)>0){
acf(y)
}
}
# # all reads
# rafalib::mypar(mfrow=c(4,4))
# for(ll in 1:length(readAvMethList)){
# y <- readAvMethList[[ll]]
# y <- y[!is.na(y)]
# if(length(y) > 3 & var(y)>0){
# acf(y)
# }
# }
nucDat <- openxlsx::read.xlsx("../../data/HMR_HML_nucleosome positions.xlsx")
nucDatHMR <- nucDat[nucDat$region == "HMR",]
overlappingReadsDyad <- readInfo[(readInfo$end >= min(nucDatHMR$dyad) & readInfo$start <= max(nucDatHMR$dyad)),]
dfLinkDyad <- data.frame(start = nucDatHMR$dyad[-nrow(nucDatHMR)]+1,
stop = nucDatHMR$dyad[2:nrow(nucDatHMR)])
readAvMethListDyad <- list()
for(rr in 1:nrow(overlappingReadsDyad)){
curData <- binaryCalls3[binaryCalls3$readID == overlappingReadsDyad$readID[rr],]
avMethLinkers <- c()
for(bb in 1:nrow(dfLinkDyad)){
curStart <- dfLinkDyad$start[bb]
curEnd <- dfLinkDyad$stop[bb]
curId <- curData$pos >= curStart & curData$pos <= curEnd
if(sum(curId) > 0){
curBinData <- curData[curData$pos >= curStart & curData$pos <= curEnd,]
avMethLinkers[bb] <- mean(curBinData$methylation)
names(avMethLinkers)[bb] <- nrow(curBinData) #coverage
} else {
avMethLinkers[bb] <- NA
}
}
readAvMethListDyad[[rr]] <- avMethLinkers
}
avMethMatDyad <- do.call(rbind, readAvMethListDyad)
colSums(is.na(avMethMatDyad)) # often no data in defined linker regions
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 1 37 24 23 30 38
## 57 67 68 67 76 72 74 76 75 78 78 77 79 78 78 79
## 2
## 78 79 70 73 72 63 63 61 63 68 70 66 69
# focus on 62 reads with data in all linker regions
readAvMethListDyadComplete <- readAvMethListDyad[rowSums(is.na(avMethMatDyad)) == 0]
rafalib::mypar(mfrow=c(2,2))
for(ll in 1:length(readAvMethListDyadComplete)){
y <- readAvMethListDyadComplete[[ll]]
if(var(y)>0){
acf(y)
}
}
# when permuting the data
rafalib::mypar(mfrow=c(2,2))
for(ll in 1:length(readAvMethListDyadComplete)){
y <- readAvMethListDyadComplete[[ll]]
if(var(y)>0){
acf(sample(y))
}
}